library(packrat)
library(tidyverse)
library(magrittr)
library(dplyr)
library(here)
library(ggplot2)
library(wordcloud)
library(wordcloud2)
library(RColorBrewer)
library(tm)
library(tidytext)
library(mapproj)
library(corpus)
library(ggmap)
library(maps)
library(ggrepel)
library(viridis)
# Print the project root resolved by `here`; the value is not stored, and the
# CSV below is read through a bare relative file name.
here::here()
## [1] "/Users/thiyangashaminitalagala/Lecturer/1_TEACHING/2020_s2/statisticalConsultancyUSJ"
# Read the raw data and convert it to a tibble in one step.
df <- as_tibble(read.csv("data_original.csv"))
# Preview the first five rows.
head(df, 5)
# A tibble: 5 x 118
ID Consultant DateRetrieved DatePublished Job_title Company R SAS
<int> <chr> <chr> <chr> <chr> <chr> <int> <int>
1 2 Jayani 44020 31/07/2020 Junior D… Dialog… 1 0
2 26 Jayani 13/08/2020 13/08/2020 Lead Dat… Snap I… 0 0
3 27 Jayani 13/08/2020 13/08/2020 Smart Ma… Micron… 0 0
4 28 Jayani 13/08/2020 13/08/2020 Data Sci… Not_de… 0 0
5 29 Jayani 13/08/2020 13/08/2020 Data Sci… PROCTE… 1 0
# … with 110 more variables: SPSS <int>, Python <int>, MAtlab <int>,
# Scala <int>, C_sharp <int>, MS.Word <int>, Ms.Excel <int>, OLE_DB <int>,
# Ms.Access <int>, Ms.PowerPoint <int>, Spreadsheets <int>,
# Data_visualization <int>, Presentation_Skills <int>, Communication <int>,
# BigData <int>, Data_warehouse <int>, cloud_storage <int>,
# Google_Cloud <int>, AWS <int>, Machine_Learning <int>, Deep.Learning <int>,
# Computer_vision <int>, Java <int>, C_plus_plus <int>, C <int>,
# Linux_Unix <int>, SQL <int>, NoSQL <int>, RDBMS <int>, Oracle <int>,
# MySQL <int>, PHP <int>, Flash_Actionscript <int>, SPL <int>,
# web_design_and_development_tools <int>, Wordpress <int>, AI <int>,
# Natural_Language_Processing.NLP. <int>, Microsoft.Power.BI <int>,
# Google_Analytics <int>, graphics_and_design_skills <int>,
# Data_marketing <int>, SEO <int>, Content_Management <int>, Tableau <int>,
# D3 <int>, Alteryx <int>, KNIME <int>, Spotfire <int>, Spark <int>,
# S3 <int>, Redshift <int>, DigitalOcean <int>, Javascript <int>,
# Kafka <int>, Storm <int>, Bash <int>, Hadoop <int>, Data_Pipelines <int>,
# MPP_Platforms <int>, Qlik <int>, Pig <int>, Hive <int>, Tensorflow <int>,
# Map_Reduce <int>, Impala <int>, Solr <int>, Teradata <int>, MongoDB <int>,
# Elasticsearch <int>, YOLO <int>, agile.execution <int>,
# Data_management <int>, pyspark <int>, Data_mining <int>,
# Data_science <int>, Web_Analytic_tools <int>, IOT <int>,
# Numerical_Analysis <int>, Economic <int>, Finance_Knowledge <int>,
# Investment_Knowledge <int>, Problem_Solving <int>, Korean_language <int>,
# Bash.Linux.Scripting <int>, Knowledge_in <chr>, Experience <chr>,
# City <chr>, Location <chr>, Educational_qualifications <chr>, Salary <chr>,
# Team_Handling <int>, Debtor_reconcilation <int>, Payroll_management <int>,
# Bayesian <int>, Optimization <int>, Bahasa.Malaysia <int>,
# English.proficiency <chr>, URL <chr>, Search_Term <chr>, …
class(df)  # class vector of the loaded data
[1] "tbl_df" "tbl" "data.frame"
ncol(df)  # number of columns
[1] 118
colnames(df)  # full list of column names
[1] "ID"
[2] "Consultant"
[3] "DateRetrieved"
[4] "DatePublished"
[5] "Job_title"
[6] "Company"
[7] "R"
[8] "SAS"
[9] "SPSS"
[10] "Python"
[11] "MAtlab"
[12] "Scala"
[13] "C_sharp"
[14] "MS.Word"
[15] "Ms.Excel"
[16] "OLE_DB"
[17] "Ms.Access"
[18] "Ms.PowerPoint"
[19] "Spreadsheets"
[20] "Data_visualization"
[21] "Presentation_Skills"
[22] "Communication"
[23] "BigData"
[24] "Data_warehouse"
[25] "cloud_storage"
[26] "Google_Cloud"
[27] "AWS"
[28] "Machine_Learning"
[29] "Deep.Learning"
[30] "Computer_vision"
[31] "Java"
[32] "C_plus_plus"
[33] "C"
[34] "Linux_Unix"
[35] "SQL"
[36] "NoSQL"
[37] "RDBMS"
[38] "Oracle"
[39] "MySQL"
[40] "PHP"
[41] "Flash_Actionscript"
[42] "SPL"
[43] "web_design_and_development_tools"
[44] "Wordpress"
[45] "AI"
[46] "Natural_Language_Processing.NLP."
[47] "Microsoft.Power.BI"
[48] "Google_Analytics"
[49] "graphics_and_design_skills"
[50] "Data_marketing"
[51] "SEO"
[52] "Content_Management"
[53] "Tableau"
[54] "D3"
[55] "Alteryx"
[56] "KNIME"
[57] "Spotfire"
[58] "Spark"
[59] "S3"
[60] "Redshift"
[61] "DigitalOcean"
[62] "Javascript"
[63] "Kafka"
[64] "Storm"
[65] "Bash"
[66] "Hadoop"
[67] "Data_Pipelines"
[68] "MPP_Platforms"
[69] "Qlik"
[70] "Pig"
[71] "Hive"
[72] "Tensorflow"
[73] "Map_Reduce"
[74] "Impala"
[75] "Solr"
[76] "Teradata"
[77] "MongoDB"
[78] "Elasticsearch"
[79] "YOLO"
[80] "agile.execution"
[81] "Data_management"
[82] "pyspark"
[83] "Data_mining"
[84] "Data_science"
[85] "Web_Analytic_tools"
[86] "IOT"
[87] "Numerical_Analysis"
[88] "Economic"
[89] "Finance_Knowledge"
[90] "Investment_Knowledge"
[91] "Problem_Solving"
[92] "Korean_language"
[93] "Bash.Linux.Scripting"
[94] "Knowledge_in"
[95] "Experience"
[96] "City"
[97] "Location"
[98] "Educational_qualifications"
[99] "Salary"
[100] "Team_Handling"
[101] "Debtor_reconcilation"
[102] "Payroll_management"
[103] "Bayesian"
[104] "Optimization"
[105] "Bahasa.Malaysia"
[106] "English.proficiency"
[107] "URL"
[108] "Search_Term"
[109] "Job_title_New"
[110] "Country"
[111] "Salary_Currency"
[112] "Min_Experience"
[113] "Statistical_software"
[114] "Programming_software"
[115] "database_softwares_and_query_languages"
[116] "Exp1"
[117] "Location_New"
[118] "Min_Educational_qualifications"
nrow(df)  # number of rows
[1] 423
tail(df, 20)  # last twenty rows
# A tibble: 20 x 118
ID Consultant DateRetrieved DatePublished Job_title Company R SAS
<int> <chr> <chr> <chr> <chr> <chr> <int> <int>
1 86 Thimani 13/8/2020 21/7/2020 Data Sci… Affini… 1 1
2 87 Thimani 13/8/2020 28/7/2020 Data Sci… Intern… 1 1
3 60 Thimani 44051 30/7/2020 Data sci… Deutsc… 1 0
4 61 Thimani 44051 43959 Data Sci… NICE A… 1 1
5 83 Thimani 13/8/2020 25/7/2020 Data Sci… Figure… 0 0
6 84 Thimani 13/8/2020 44172 Data Sci… Predic… 0 0
7 20 Jayani 13/08/2020 13/08/2020 Data Ana… Ernst … 0 0
8 21 Jayani 13/08/2020 44173 Data Ana… Fitch … 0 0
9 161 Piyumika 44082 25/7/2020 Data Ana… E.D.Bu… 0 1
10 162 Piyumika 44082 25/7/2020 Senior D… E.D.Bu… 0 1
11 208 Nimesha 44082 44051 Junior D… Beer52 0 0
12 226 Nimesha 44173 44173 Junior D… Xcede 1 0
13 227 Nimesha 44173 44143 Data Ana… OneMag… 0 0
14 318 Rajitha 44113 43897 Data Sci… Facebo… 1 1
15 320 Rajitha 44113 43897 Data Sci… Apple … 1 0
16 319 Rajitha 44113 44111 Machine … Deloit… 0 0
17 169 Piyumika 44143 44032 Data Sci… Ness T… 0 0
18 170 Piyumika 44143 43971 Full Sta… Ness T… 0 0
19 336 Rajitha 15/09/2020 44112 Data Sci… Eighte… 0 0
20 262 Sanduni 44112 43929 Lead Dat… Target 1 0
# … with 110 more variables: SPSS <int>, Python <int>, MAtlab <int>,
# Scala <int>, C_sharp <int>, MS.Word <int>, Ms.Excel <int>, OLE_DB <int>,
# Ms.Access <int>, Ms.PowerPoint <int>, Spreadsheets <int>,
# Data_visualization <int>, Presentation_Skills <int>, Communication <int>,
# BigData <int>, Data_warehouse <int>, cloud_storage <int>,
# Google_Cloud <int>, AWS <int>, Machine_Learning <int>, Deep.Learning <int>,
# Computer_vision <int>, Java <int>, C_plus_plus <int>, C <int>,
# Linux_Unix <int>, SQL <int>, NoSQL <int>, RDBMS <int>, Oracle <int>,
# MySQL <int>, PHP <int>, Flash_Actionscript <int>, SPL <int>,
# web_design_and_development_tools <int>, Wordpress <int>, AI <int>,
# Natural_Language_Processing.NLP. <int>, Microsoft.Power.BI <int>,
# Google_Analytics <int>, graphics_and_design_skills <int>,
# Data_marketing <int>, SEO <int>, Content_Management <int>, Tableau <int>,
# D3 <int>, Alteryx <int>, KNIME <int>, Spotfire <int>, Spark <int>,
# S3 <int>, Redshift <int>, DigitalOcean <int>, Javascript <int>,
# Kafka <int>, Storm <int>, Bash <int>, Hadoop <int>, Data_Pipelines <int>,
# MPP_Platforms <int>, Qlik <int>, Pig <int>, Hive <int>, Tensorflow <int>,
# Map_Reduce <int>, Impala <int>, Solr <int>, Teradata <int>, MongoDB <int>,
# Elasticsearch <int>, YOLO <int>, agile.execution <int>,
# Data_management <int>, pyspark <int>, Data_mining <int>,
# Data_science <int>, Web_Analytic_tools <int>, IOT <int>,
# Numerical_Analysis <int>, Economic <int>, Finance_Knowledge <int>,
# Investment_Knowledge <int>, Problem_Solving <int>, Korean_language <int>,
# Bash.Linux.Scripting <int>, Knowledge_in <chr>, Experience <chr>,
# City <chr>, Location <chr>, Educational_qualifications <chr>, Salary <chr>,
# Team_Handling <int>, Debtor_reconcilation <int>, Payroll_management <int>,
# Bayesian <int>, Optimization <int>, Bahasa.Malaysia <int>,
# English.proficiency <chr>, URL <chr>, Search_Term <chr>, …
# ---- One-way frequency summaries ----

# Build a frequency table for one categorical vector.
#   values:      vector of category labels; table() is called with its
#                defaults, so missing values are not counted.
#   column_name: name given to the category column of the result.
# Returns a data frame with columns <column_name>, Frequency and Percentage,
# sorted by descending Frequency.
frequency_table <- function(values, column_name) {
  counts <- as.data.frame(table(values))
  names(counts) <- c(column_name, "Frequency")
  counts <- counts[order(counts$Frequency, decreasing = TRUE), ]
  counts %>% mutate(Percentage = Frequency * 100 / sum(Frequency))
}

# Draw a horizontal bar chart of a table produced by frequency_table().
#   counts:  data frame whose first column holds the categories and which
#            has a numeric Frequency column.
#   x_label: axis label for the category axis.
#   title:   plot title.
#   fill:    bar colour.
# Returns the ggplot object (auto-printed when called at top level).
frequency_bar_chart <- function(counts, x_label, title, fill) {
  category <- names(counts)[1]
  ggplot(
    counts,
    aes(x = reorder(.data[[category]], Frequency), y = Frequency)
  ) +
    labs(y = "Count of Jobs", x = x_label) +
    geom_bar(stat = "identity", width = 0.5, fill = fill) +
    ggtitle(title) +
    coord_flip()
}

d_1211 <- frequency_table(df$Job_title_New, "Job_title")
d_1211
## Job_title Frequency Percentage
## 1 DS 211 49.881797
## 2 DA 119 28.132388
## 3 Other 31 7.328605
## 4 DE 20 4.728132
## 5 Analyst 14 3.309693
## 6 Statician 9 2.127660
## 7 Actuarial 8 1.891253
## 8 SP 6 1.418440
## 9 Scientist 5 1.182033
frequency_bar_chart(
  d_1211,
  x_label = "Job Title",
  title = "Bar chart of Job Title",
  fill = "#FC4E07"
)

d_1211 <- frequency_table(df$Statistical_software, "Statistical_software")
d_1211
## Statistical_software Frequency Percentage
## 1 R & Python 124 29.314421
## 2 No Statistical softwares 94 22.222222
## 3 Other 91 21.513002
## 4 Python 88 20.803783
## 5 R, SAS & Python 26 6.146572
frequency_bar_chart(
  d_1211,
  x_label = "Statistical software",
  title = "Bar chart of Statistical software",
  fill = "#CC79A7"
)

d_1211 <- frequency_table(df$Programming_software, "Programming_software")
d_1211
## Programming_software Frequency Percentage
## 1 No programming software 349 82.5059102
## 2 Other 43 10.1654846
## 3 Java only 15 3.5460993
## 4 Spark only 15 3.5460993
## 5 1::0::1::0::0::1 1 0.2364066
# NOTE(review): the level "1::0::1::0::0::1" above looks like a data-entry
# artefact rather than a real category - confirm and recode it in the data.
frequency_bar_chart(
  d_1211,
  x_label = "Programming software",
  title = "Bar chart of Programming software",
  fill = "#009E73"
)

d_1211 <- frequency_table(
  df$database_softwares_and_query_languages,
  "database_softwares_and_query_languages"
)
d_1211
## database_softwares_and_query_languages Frequency Percentage
## 1 No database software and/or query language 169 39.952719
## 2 SQL only 167 39.479905
## 3 Other 35 8.274232
## 4 SQL & MySQL only 27 6.382979
## 5 SQL & Handoop only 13 3.073286
## 6 SQL & NoSQL only 12 2.836879
frequency_bar_chart(
  d_1211,
  x_label = "database softwares and query languages",
  title = "database softwares and query languages",
  fill = "#56B4E9"
)

# ---- Minimum experience by job title ----

# Store the experience band as a factor, so table() reports every band for
# every job title, including bands with a zero count.
df$Exp1 <- as.factor(df$Exp1)

# Cross-tabulate job title against minimum experience and draw one stacked
# bar per job title. The table is plotted as produced: bars and stack
# segments are placed by factor level, so no row sorting or label-position
# column is needed.
d12 <- as.data.frame(table(df$Job_title_New, df$Exp1))
names(d12) <- c("Job_title", "Min_Experience", "No_of_cases")
ggplot(d12, aes(y = No_of_cases, x = Job_title, fill = Min_Experience)) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle("Bar chart of people by Minimum Experience and Job title") +
  coord_flip()

# One experience bar chart per job title; names are the Job_title_New values
# ("Statician" is the spelling stored in the data), values are bar colours.
role_fills <- c(
  DS = "#CC79A7", DA = "#56B4E9", DE = "#E69F00", Analyst = "#999999",
  Statician = "#56B4E9", Actuarial = "black", SP = "#E7B800",
  Scientist = "#00AFBB"
)
for (role in names(role_fills)) {
  dd1 <- df %>% filter(Job_title_New == role) %>% select(Exp1)
  d12 <- as.data.frame(table(dd1$Exp1))
  names(d12) <- c("Min_Experience", "Frequency")
  d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(d12, aes(x = reorder(Min_Experience, Frequency), y = Frequency)) +
      labs(y = "Number of Jobs", x = "Min_Experience") +
      geom_bar(stat = "identity", width = 0.5, fill = role_fills[[role]]) +
      ggtitle(paste("Bar chart of", role, "people by Experience")) +
      coord_flip()
  )
}

# ---- Statistical software by job title ----

# Cross-tabulate job title against statistical software and draw one stacked
# bar per job title. The table is plotted as produced: bars and stack
# segments are placed by factor level, so no row sorting or label-position
# column is needed.
d12 <- as.data.frame(table(df$Job_title_New, df$Statistical_software))
names(d12) <- c("Job_title", "Statistical_software", "No_of_cases")
ggplot(
  d12,
  aes(y = No_of_cases, x = Job_title, fill = Statistical_software)
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle("Bar chart of people by Statistical_software and Job title") +
  coord_flip()

# One bar chart per job title; names are the Job_title_New values
# ("Statician" is the spelling stored in the data), values are bar colours.
role_fills <- c(
  DA = "#56B4E9", DE = "#E69F00", Analyst = "#999999",
  Statician = "#56B4E9", Actuarial = "black", SP = "#E7B800",
  Scientist = "#00AFBB"
)
for (role in names(role_fills)) {
  dd1 <- df %>%
    filter(Job_title_New == role) %>%
    select(Statistical_software)
  d12 <- as.data.frame(table(dd1$Statistical_software))
  names(d12) <- c("Statistical_software", "Frequency")
  d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(
      d12,
      aes(x = reorder(Statistical_software, Frequency), y = Frequency)
    ) +
      labs(y = "Number of Jobs", x = "Statistical_software") +
      geom_bar(stat = "identity", width = 0.5, fill = role_fills[[role]]) +
      ggtitle(paste("Bar chart of", role, "people by Statistical software")) +
      coord_flip()
  )
}

# ---- Programming software by job title ----

# Cross-tabulate job title against programming software and draw one stacked
# bar per job title. The table is plotted as produced: bars and stack
# segments are placed by factor level, so no row sorting or label-position
# column is needed.
d12 <- as.data.frame(table(df$Job_title_New, df$Programming_software))
names(d12) <- c("Job_title", "Programming_software", "No_of_cases")
ggplot(
  d12,
  aes(y = No_of_cases, x = Job_title, fill = Programming_software)
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle("Bar chart of people by Programming_software and Job title") +
  coord_flip()

# One bar chart per job title; names are the Job_title_New values
# ("Statician" is the spelling stored in the data), values are bar colours.
role_fills <- c(
  DA = "#56B4E9", DE = "#E69F00", Analyst = "#999999",
  Statician = "#56B4E9", Actuarial = "black", SP = "#E7B800",
  Scientist = "#00AFBB"
)
for (role in names(role_fills)) {
  dd1 <- df %>%
    filter(Job_title_New == role) %>%
    select(Programming_software)
  d12 <- as.data.frame(table(dd1$Programming_software))
  names(d12) <- c("Programming_software", "Frequency")
  d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(
      d12,
      aes(x = reorder(Programming_software, Frequency), y = Frequency)
    ) +
      labs(y = "Number of Jobs", x = "Programming_software") +
      geom_bar(stat = "identity", width = 0.5, fill = role_fills[[role]]) +
      ggtitle(paste("Bar chart of", role, "people by Programming software")) +
      coord_flip()
  )
}

# ---- Database software and query languages by job title ----

# Cross-tabulate job title against database software / query language and
# draw one stacked bar per job title. The table is plotted as produced: bars
# and stack segments are placed by factor level, so no row sorting or
# label-position column is needed.
d12 <- as.data.frame(
  table(df$Job_title_New, df$database_softwares_and_query_languages)
)
names(d12) <- c(
  "Job_title", "database_softwares_and_query_languages", "No_of_cases"
)
ggplot(
  d12,
  aes(
    y = No_of_cases,
    x = Job_title,
    fill = database_softwares_and_query_languages
  )
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle(
    "Bar chart of people by database_softwares_and_query_languages and Job title"
  ) +
  coord_flip()

# One bar chart per job title; names are the Job_title_New values
# ("Statician" is the spelling stored in the data), values are bar colours.
role_fills <- c(
  DA = "#56B4E9", DE = "#E69F00", Analyst = "#999999",
  Statician = "#56B4E9", Actuarial = "black", SP = "#E7B800",
  Scientist = "#00AFBB"
)
for (role in names(role_fills)) {
  dd1 <- df %>%
    filter(Job_title_New == role) %>%
    select(database_softwares_and_query_languages)
  d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
  names(d12) <- c("database_softwares_and_query_languages", "Frequency")
  d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(
      d12,
      aes(
        x = reorder(database_softwares_and_query_languages, Frequency),
        y = Frequency
      )
    ) +
      labs(
        y = "Number of Jobs",
        x = "database_softwares_and_query_languages"
      ) +
      geom_bar(stat = "identity", width = 0.5, fill = role_fills[[role]]) +
      ggtitle(
        paste(
          "Bar chart of", role,
          "people by database_softwares_and_query_languages"
        )
      ) +
      coord_flip()
  )
}

# ---- Minimum educational qualification by job title ----

# Cross-tabulate job title against minimum educational qualification and
# draw one stacked bar per job title. The table is plotted as produced: bars
# and stack segments are placed by factor level, so no row sorting or
# label-position column is needed.
d12 <- as.data.frame(
  table(df$Job_title_New, df$Min_Educational_qualifications)
)
names(d12) <- c("Job_title", "Min_Educational_qualifications", "No_of_cases")
ggplot(
  d12,
  aes(y = No_of_cases, x = Job_title, fill = Min_Educational_qualifications)
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle(
    "Bar chart of people by Min_Educational_qualifications and Job title"
  ) +
  coord_flip()

# One bar chart per job title; names are the Job_title_New values
# ("Statician" is the spelling stored in the data), values are bar colours.
# Every chart, including the SP one, labels its category axis
# "Min_Educational_qualifications".
role_fills <- c(
  DA = "#56B4E9", DE = "#E69F00", Analyst = "#999999",
  Statician = "#56B4E9", Actuarial = "black", SP = "#E7B800",
  Scientist = "#00AFBB"
)
for (role in names(role_fills)) {
  dd1 <- df %>%
    filter(Job_title_New == role) %>%
    select(Min_Educational_qualifications)
  d12 <- as.data.frame(table(dd1$Min_Educational_qualifications))
  names(d12) <- c("Min_Educational_qualifications", "Frequency")
  d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(
      d12,
      aes(
        x = reorder(Min_Educational_qualifications, Frequency),
        y = Frequency
      )
    ) +
      labs(y = "Number of Jobs", x = "Min_Educational_qualifications") +
      geom_bar(stat = "identity", width = 0.5, fill = role_fills[[role]]) +
      ggtitle(
        paste("Bar chart of", role, "people by Min_Educational_qualifications")
      ) +
      coord_flip()
  )
}

# ---- Salary currency, salary and country by job title ----

# For each column below, cross-tabulate it against job title and draw one
# stacked bar per job title, filled by that column's values. The table is
# plotted as produced: bars and stack segments are placed by factor level,
# so no row sorting or label-position column is needed.
for (split_var in c("Salary_Currency", "Salary", "Country")) {
  d12 <- as.data.frame(table(df$Job_title_New, df[[split_var]]))
  names(d12) <- c("Job_title", split_var, "No_of_cases")
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(
      d12,
      aes(y = No_of_cases, x = Job_title, fill = .data[[split_var]])
    ) +
      # The legend title is set explicitly so it shows the column name.
      labs(y = "Number of Jobs", x = "Job_title", fill = split_var) +
      geom_bar(stat = "identity", width = 0.5) +
      ggtitle(paste("Bar chart of people by", split_var, "and Job title")) +
      coord_flip()
  )
}

# ---- Word cloud of the advert URLs ----

# Build a tm corpus with one document per URL string.
text3 <- df$URL
docs2 <- Corpus(VectorSource(text3))

# Clean the text one transformation at a time: collapse whitespace, then
# strip punctuation, then strip digits.
docs2 <- tm_map(docs2, stripWhitespace)
docs2 <- tm_map(docs2, removePunctuation)
docs2 <- tm_map(docs2, removeNumbers)
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents

# Total each term's count across all documents, most frequent first.
dtm2 <- TermDocumentMatrix(docs2)
matrix2 <- as.matrix(dtm2)
words2 <- sort(rowSums(matrix2), decreasing = TRUE)
df2 <- data.frame(word = names(words2), freq = words2)

# Render the term frequencies as a pentagon-shaped word cloud.
p <- wordcloud2(
  data = df2,
  size = 0.9,
  color = "random-dark",
  shape = "pentagon"
)
p